本项目是Kaggle上的一个已经结束的比赛项目。
# Download the competition's public dataset.
# NOTE: these signed Google Storage URLs carry an `Expires` timestamp from 2018
# and no longer work; regenerate fresh download links from the Kaggle
# "Dogs vs. Cats Redux" competition page (or use the Kaggle API) before running.
!wget -c "https://storage.googleapis.com/kaggle-competitions-data/kaggle/5441/train.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1529407270&Signature=TTmj2gZCjWT%2FjzsB1hBLfYURi6aN6LKWNDnKYbW4efA8WYi3D2fvUuO0tGIPtGh5SjujCxf64XKDwa0WytUPwc8r4EGOeOGkuXW%2F4ADphxtvIUyFLLp6A0A%2B3bUWvHE91Ueth%2F%2FA8V669Az%2BcSPN4Xl0Sfe5ESkhxJwG4dBIOowWJrR%2BFGtoUpq%2FTgbxvmR1azV4Z2qXuKDVlEm1PMTMQ6w0YpgWOhf7VSq6EeHMqv9aogE4KX8Aey2NIBVcgMPjsXSPnOXRH0im%2Bq80XPF75N5s%2Fle1Q%2BHY5wHvWIgaAUDyQbrh8e8cM%2BZHhpkMy0UDt6YckW1a4GP8eNCeB3BVWg%3D%3D" -O train.zip
!wget -c "https://storage.googleapis.com/kaggle-competitions-data/kaggle/5441/test.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1529407229&Signature=AJoqcxiDSiRAlhI%2BdAoWQ6M86nGqTDyHLvfp%2BB2yWai1SnI4%2BBq3wFzYP%2BZQynGLbexUpZqy9LR8UixN3%2F1ZRrQu9d07Tv6f65WAr6w1fifzk%2BW39csagz1NCP1GHN6jqZS95tNmdXJCXXNjGaAI3IzggQr3L6OxdonF%2B%2B2FZjQWOy72jh8Vioi%2BYW%2FGLrAEjy2x5f2ueBvH2LfQBMdChl1pBLcIEwdvUDmIBs629yiFlBKpShiAPb%2BoNEkcBNTDKBUWv%2FiC2TWCsNxaHqMflXjRzc%2BjfoZmTWm1PeN2tbuKV3vfGur6oQvZKqp2yPeqReQhThIVr5y07UiXLBDyeg%3D%3D" -O test.zip
!wget -c "https://storage.googleapis.com/kaggle-competitions-data/kaggle/5441/sample_submission.csv?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1529408376&Signature=lzVwBh%2B7NIDllvrngUveC3QKOct0KVt7i8IKut8rduWjMrILuyfOX9xqHV0vyqXqYiKanaJq5g6BzEhuPNZLRa9tdQFZkvPbD2shK%2BOgQZKWH0szUBUaws3mMdaCFHmqvsU3D%2BfgvFqW0ZP2F%2Bvu%2B9NARFt6xMom5Ku8LmsiOGvVZ8Hk3ZKDzZ0kR%2FBhZqShoDCY8Heht49GNsCKtU%2FR1i93ILBKCUluWO%2Fa1XB7%2F9X%2FHGlVh4RTu9yOoviSuq0x5zH%2BdEgfcLlrDzSj2Ch%2B16oLZDDfzQlAKhxy4M%2F3%2FGj7kuYGkTAY4a8gRUjoAttWxb7C7QbGRchTmgEkrwLGQA%3D%3D" -O DATASET/submission/sample_submission.csv
# 对数据集解压缩
!unzip train.zip -d DATASET/
!unzip test.zip -d DATASET/
import os
import re
import random
import shutil
from keras.preprocessing import image
import numpy as np
from tqdm import tqdm
from keras.applications import xception
from keras.applications.xception import Xception
from keras.applications.xception import preprocess_input, decode_predictions
from keras.applications import inception_resnet_v2
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.applications.inception_resnet_v2 import preprocess_input, decode_predictions
from keras.applications import densenet
from keras.applications.densenet import DenseNet201
from keras.applications.densenet import preprocess_input, decode_predictions
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Lambda
from keras.layers import Activation, Dropout, Flatten, Dense, Input, Dropout, BatchNormalization
from keras import backend as K
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import csv
import cv2
import h5py
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from PIL import Image
%matplotlib inline
# Initialize every dataset-related folder path, including result folders.
'''
Dataset folder layout:
DATASET/
    train/
        dogs/
        cats/
    test/
        pics/
    exception/
    submission/
'''
# Fix the RNG seed so every run draws the same random numbers.
np.random.seed(1024)
# Folder path constants used throughout the script.
DATASET = "DATASET"
train_dir = "{}/{}".format(DATASET, "train")
test_dir = "{}/{}".format(DATASET, "test")
submission_dir = "{}/{}".format(DATASET, "submission")
exception_dir = "{}/{}".format(DATASET, "exception")
def check_folders(dataset_dir, train_dir, test_dir, submis_dir, exception_dir):
    """Create the dataset root folder and its four sub-folders if missing.

    Parameters
    ----------
    dataset_dir : root folder name/path (e.g. 'DATASET').
    train_dir, test_dir, submis_dir, exception_dir : sub-folder names
        created *inside* dataset_dir.

    Prints a creation/already-exists message for each folder, matching the
    original script's output exactly.
    """
    def _ensure(path, label):
        # Create *path* when absent and report the outcome either way.
        if not os.path.exists(path):
            os.makedirs(path)
            print("{} folder is created successfully...".format(label))
        else:
            print("{} folder already created, please check!".format(label))
    # Root first, then each sub-folder (deduplicates five copy-pasted branches).
    _ensure(dataset_dir, "dataset")
    for sub, label in ((train_dir, "train"), (test_dir, "test"),
                       (submis_dir, "submission"), (exception_dir, "exception")):
        _ensure(dataset_dir + "/" + sub, label)
# Ensure all DATASET sub-folders exist before anything is read or written.
check_folders('DATASET', 'train', 'test', 'submission', 'exception')
# The pre-downloaded "ImageNetFullClasses.csv" under DATASET/ maps each
# ImageNet class code to a category label; load it into a dict for lookups.
classes_file_path = "DATASET/ImageNetFullClasses.csv"
ImageNet_full_classes_dict = {}
with open(classes_file_path, "r", encoding='UTF-8') as csv_reader:
    reader = csv.reader(csv_reader)
    for line in reader:
        # column 0: ImageNet class code, column 1: category label
        i_key = line[0]
        i_value = line[1]
        ImageNet_full_classes_dict[i_key] = i_value
print(ImageNet_full_classes_dict)
# Input image sizes: 299x299 for Xception / InceptionResNetV2,
# 224x224 for DenseNet201; shapes add the 3 RGB channels.
img_size = (299, 299)
img_size2 = (224, 224)
img_shape = img_size + (3,)
img_shape2 = img_size2 + (3,)
# Raw train/test image folders (before the cats/dogs/pics sub-folders exist).
train_path = "DATASET/train"
test_path = "DATASET/test"
# List the names of the entries under a given path.
def get_file_names(path):
    """Return the list of entry names directly under *path*."""
    return os.listdir(path)
# Load one image from disk and turn it into a batched, preprocessed array.
def read_img(fp, img_shape, preprocess=None):
    """Read the image at *fp*, resize it to *img_shape*, and preprocess it.

    Parameters
    ----------
    fp : str -- image file path.
    img_shape : tuple -- target size passed to ``image.load_img``.
    preprocess : callable, optional -- model-specific preprocessing
        function. Defaults to the module-level ``preprocess_input``.

    Returns a (1, H, W, 3) numpy array ready for ``model.predict``.

    NOTE(review): three ``preprocess_input`` functions are imported at the
    top of this file (xception, inception_resnet_v2, densenet) and each
    import shadows the previous one, so the bare name resolves to
    ``densenet.preprocess_input`` for ALL models. DenseNet normalizes
    differently from Xception/InceptionResNetV2, so pass the matching
    function explicitly via *preprocess* when extracting features for a
    specific backbone.
    """
    if preprocess is None:
        preprocess = preprocess_input
    img = image.load_img(fp, target_size=img_shape)
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)  # add the batch dimension -> (1, H, W, 3)
    return preprocess(x)
# Preprocess every file in the list, showing a tqdm progress bar.
def get_img(file_list, img_shape):
    """Return one preprocessed (1, H, W, 3) batch per file in *file_list*."""
    batches = []
    for fname in tqdm(file_list):
        batches.append(read_img(fname, img_shape))
    return batches
# Collect full paths of the training images, keeping regular files only
# (sub-directories created later under train/ are filtered out).
file_list = [train_path + "/" + i for i in get_file_names(train_path)]
file_list = [i for i in file_list if os.path.isfile(i)]
# Same for the test images.
test_list = [test_path + "/" + i for i in get_file_names(test_path)]
test_list = [i for i in test_list if os.path.isfile(i)]
# Natural ("human") sort for paths containing numbers, e.g. cat.2.jpg < cat.10.jpg.
## digit-run pattern; the capturing group keeps the digits in re.split output
re_digits = re.compile(r'(\d+)')
## sort key: split a string into text/number chunks, numbers compared as ints
def sorted_numbers(s):
    chunks = re_digits.split(s)
    # odd indices hold the captured digit runs -> convert those to int
    chunks[1::2] = [int(chunk) for chunk in chunks[1::2]]
    return chunks
def sort_list_with_numbers(alist):
    """Return *alist* sorted with embedded numbers compared numerically."""
    return sorted(alist, key=sorted_numbers)
# Order the training list by the number embedded in each filename.
file_list = sort_list_with_numbers(file_list)
# print(get_img(file_list, img_shape))
# Display a single image, titled with its full path.
def show_pic(full_path):
    """Render the image at *full_path* via matplotlib."""
    picture = mpimg.imread(full_path)
    plt.imshow(picture)
    plt.axis("on")
    plt.title(full_path)
    plt.show()
# print(file_list)
# Sanity-check the number of images in the train and test sets.
print("train图片集中图片数量:{},test图片集中图片数量:{}".format(len(file_list), len(test_list)))
# Randomly preview 5 images each from the train and test sets.
file_list_sample = random.sample(file_list, 5)
test_list_sample = random.sample(test_list, 5)
# NOTE(review): show_list aliases file_list_sample, so extend() below also
# mutates file_list_sample; harmless here because it is not reused.
show_list = file_list_sample
show_list.extend(test_list_sample)
plt.figure(figsize=(16, 8))
for i in range(len(show_list)):
    # 2x5 grid: top row = train samples, bottom row = test samples
    plt.subplot(2,5,i+1)
    img = mpimg.imread(show_list[i])
    img = cv2.resize(img, (224, 224))
    plt.axis("on")
    plt.title(show_list[i])
    plt.imshow(img)
# Extract the ImageNet class codes whose label is cat ("猫") or dog ("狗").
def get_classes(dic):
    """Return the class codes in *dic* labelled as cat or dog.

    *dic* maps ImageNet class code -> category label (typically the
    ImageNet_full_classes_dict built earlier). An empty/falsy dict prints
    a warning and yields an empty list.
    """
    if not dic:
        print("ImageNet_full_classes_dict is empty, please check!")
        return []
    return [code for code, label in dic.items() if label in ("猫", "狗")]
# print(ImageNet_full_classes_dict)
# Class codes that count as "cat or dog" when screening model predictions.
valid_classes = get_classes(ImageNet_full_classes_dict)
print(valid_classes)
# Run *model* over a list of single-image batches and decode top-k labels.
def get_preds(model, X, top = 40):
    """Predict every batch in *X* and return the decoded top-*top* results.

    Parameters
    ----------
    model : a Keras model with ImageNet weights.
    X : iterable of (1, H, W, 3) arrays as produced by read_img/get_img.
    top : number of highest-probability classes to keep per image.

    Returns a list with one decode_predictions entry (a list of
    (class_code, class_name, score) tuples) per input image.
    """
    # FIX: dropped the no-op self-assignment `model = model` from the original.
    results = []
    for x in tqdm(X):
        pred = model.predict(x)
        # decode_predictions returns one list per batch row; batches are size 1
        pred_decode = decode_predictions(pred, top = top)[0]
        results.append(pred_decode)
    return results
# Use pretrained ImageNet models (Xception, DenseNet201, InceptionResNetV2)
# as screeners for non-cat/non-dog (anomalous) training images.
# NOTE(review): get_img applies the module-level preprocess_input, which the
# import chain resolves to densenet's version for BOTH tensors below -- the
# 299x299 models may see slightly mismatched normalization; confirm intent.
X = get_img(file_list, img_shape)
X2 = get_img(file_list, img_shape2)
# model: Xception
model_xception = Xception(weights='imagenet')
# model: DenseNet201
model_DenseNet201 = DenseNet201(weights='imagenet')
# model: InceptionResNetV2
model_InceptionResNetV2 = InceptionResNetV2(weights='imagenet')
# Flag images whose decoded predictions contain no cat/dog class at all.
def get_except_index(preds):
    """Return a 0/1 list parallel to *preds*.

    1 marks an anomaly (none of the image's top-k predicted class codes
    appears in the module-level ``valid_classes`` list); 0 marks a normal
    image, e.g. [0, 0, 1, 1, 1] for five images with three anomalies.
    """
    except_index = []
    for pred in preds:
        # count how many of this image's predicted codes are cat/dog codes
        hits = sum(1 for item in pred if item[0] in valid_classes)
        except_index.append(1 if hits == 0 else 0)
    return except_index
def exception_info(model, top = 50, shape=img_shape):
    """Screen the training images with *model* and report anomaly counts.

    *shape* selects the preloaded tensor list: (224, 224, 3) picks X2
    (DenseNet201-sized inputs); anything else picks X (299x299 inputs).
    Returns the 0/1 anomaly flag list from get_except_index.
    """
    # pick the preloaded image list matching the model's input size
    inpt = X2 if tuple(shape) == (224, 224, 3) else X
    flags = get_except_index(get_preds(model, inpt, top = top))
    # report total images, the top-k used, and how many were flagged
    print("图片总数:{}, top = {}时,异常图片的数量:{}".format(len(flags), top, sum(flags)))
    return flags
# to show exception pics preds when top = 3
except_index_xception_3 = exception_info(model_xception, top = 3)
# to show exception pics preds when top = 10
except_index_xception_10 = exception_info(model_xception, top = 10)
# to show exception pics preds when top = 30
except_index_xception_30 = exception_info(model_xception, top = 30)
# to show exception pics preds when top = 40
except_index_xception_40 = exception_info(model_xception, top = 40)
# to show exception pics preds when top = 50
except_index_xception_50 = exception_info(model_xception, top = 50)
# Compare anomaly counts across different top values: to avoid deleting true
# cats/dogs while still catching anomalies, top = 10 was the planned setting.
# to show exception pics preds when top = 60
except_index_xception_60 = exception_info(model_xception, top = 60)
# Cross-check with two more architectures at top = 40 (DenseNet needs 224x224).
except_index_DenseNet201_40 = exception_info(model_DenseNet201, top = 40, shape=img_shape2)
except_index_InceptionResNetV2_40 = exception_info(model_InceptionResNetV2, top = 40)
# Aggregate the anomaly flags from the three models into exception_total.
ex1 = np.array(except_index_xception_40)
ex2 = np.array(except_index_DenseNet201_40)
ex3 = np.array(except_index_InceptionResNetV2_40)
# element-wise sum: value k means k of the 3 models flagged the image
exception_total = ex1 + ex2 + ex3
exception_total = list(exception_total)
# print(exception_total)
# Show every image flagged by at least one model.
except_list = []
for i,v in enumerate(exception_total):
    if v > 0:
        except_list.append(file_list[i])
        show_pic(file_list[i])
print("异常图片数量:{}, 异常图片显示如上,异常文件列表:{}".format(len(except_list), except_list))
# Archive the anomalous images into 'DATASET/exception'.
if len(except_list) > 0:
    for i in except_list:
        shutil.move(i, exception_dir)
# Rebuild file_list/test_list after the move; these become the baseline
# image sets for training and validation.
file_list = [train_path + "/" + i for i in get_file_names(train_path)]
test_list = [test_path + "/" + i for i in get_file_names(test_path)]
# Re-seed for reproducibility of everything downstream.
np.random.seed(1024)
cat_list = []
dog_list = []
# Split the training files by the 'cat.' / 'dog.' token in their filenames
# (Kaggle names the files cat.<n>.jpg / dog.<n>.jpg).
for i in file_list:
    match_cat = re.findall(r'\bcat\.', i)
    match_dog = re.findall(r'\bdog\.', i)
    if match_cat:
        cat_list.append(i)
    elif match_dog:
        dog_list.append(i)
print("处理后的总图片数:{}, 其中猫的数量:{},狗的数量:{}".format(len(cat_list)+len(dog_list), len(cat_list), len(dog_list)))
# Create train/dogs, train/cats and the test/pics folder, then move the files
# into them so flow_from_directory can infer class labels from the sub-folders
# (test images go into a single dummy class folder 'pics').
train_dog_dir = DATASET + "/train/dogs"
train_cat_dir = DATASET + "/train/cats"
test_pics_dir = DATASET + "/test/pics"
if not os.path.exists(train_dog_dir):
    os.makedirs(train_dog_dir)
if not os.path.exists(train_cat_dir):
    os.makedirs(train_cat_dir)
if not os.path.exists(test_pics_dir):
    os.makedirs(test_pics_dir)
for i in cat_list:
    shutil.move(i, DATASET + "/train/cats")
for i in dog_list:
    shutil.move(i, DATASET + "/train/dogs")
for i in test_list:
    shutil.move(i, DATASET + "/test/pics")
# Extract bottleneck (GAP) features with a pretrained backbone and cache them.
def write_gap(MODEL, img_size, lambda_func=None):
    """Compute GlobalAveragePooling features for train/test and save to HDF5.

    Parameters
    ----------
    MODEL : a keras.applications model class (e.g. Xception).
    img_size : (height, width) input size for the model.
    lambda_func : optional preprocessing function applied inside the graph
        (e.g. xception.preprocess_input).

    Writes saved_models/weights.<ModelName>.hdf5 containing datasets
    'train', 'test' and 'label' (the train generator's class indices).
    """
    img_shape = (img_size[0], img_size[1], 3)
    input_tensor = Input(img_shape)
    x = input_tensor
    if lambda_func:
        x = Lambda(lambda_func)(x)
    # Flow images straight from the class sub-folders; shuffle=False keeps
    # the feature rows aligned with generator.classes / generator.filenames.
    data_gen = ImageDataGenerator()
    X_train_gen = data_gen.flow_from_directory(train_dir, img_size, shuffle = False,batch_size = 16)
    X_test_gen = data_gen.flow_from_directory(test_dir, img_size, shuffle = False,batch_size = 16, classes = None)
    base_model = MODEL(input_tensor=x, weights='imagenet', include_top=False)
    model = Model(base_model.input, GlobalAveragePooling2D()(base_model.output))
    # Compute the feature vectors.
    train = model.predict_generator(X_train_gen, verbose = 1)
    test = model.predict_generator(X_test_gen, verbose = 1)
    # FIX: ensure the output folder exists, and open the file with an explicit
    # write mode -- h5py >= 3 defaults to read-only, which fails for a new file.
    os.makedirs('saved_models', exist_ok=True)
    with h5py.File('saved_models/weights.{}.hdf5'.format(MODEL.__name__), 'w') as h:
        h.create_dataset('train', data = train)
        h.create_dataset('test', data = test)
        h.create_dataset('label', data = X_train_gen.classes)
# Cache bottleneck features for each backbone with its own preprocessing.
write_gap(Xception, (299,299), xception.preprocess_input)
write_gap(InceptionResNetV2, (299, 299), inception_resnet_v2.preprocess_input)
write_gap(DenseNet201, (224, 224), densenet.preprocess_input)
X_train = []
X_test = []
weights_list = ["saved_models/weights.DenseNet201.hdf5", "saved_models/weights.InceptionResNetV2.hdf5", "saved_models/weights.Xception.hdf5"]
# Load the cached features from all three models and stack them side by side.
for w in weights_list:
    with h5py.File(w, 'r') as h:
        X_train.append(np.array(h['train']))
        X_test.append(np.array(h['test']))
        # presumably identical in every file since all generators used
        # shuffle=False over the same folders -- verify if files change
        y_train = np.array(h['label'])
# Concatenate along the feature axis -> one wide vector per image.
X_train = np.concatenate(X_train, axis=1)
X_test = np.concatenate(X_test, axis=1)
# Shuffle features and labels together so validation_split (which takes the
# tail) sees a random subset; uses numpy's global RNG seeded earlier.
X_train, y_train = shuffle(X_train, y_train)
# Classifier head on top of the concatenated bottleneck features.
input_tensor = Input(X_train.shape[1:])
# NOTE(review): the graphviz sketch below shows Dropout 0.5 and a Dense(1024)
# layer, but the code uses Dropout(0.85) straight into the output -- confirm
# which design is intended.
x = Dropout(0.85)(input_tensor)
# single sigmoid unit; with flow_from_directory's alphabetical class order,
# label 1 is presumably 'dogs' -- verify against the generator's class_indices
x = Dense(1, activation='sigmoid')(x)
model = Model(input_tensor, x)
model.compile(optimizer='adadelta',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
# Render an architecture sketch with graphviz.
# NOTE(review): these labels do not match the head defined above -- the code
# uses Dropout(0.85) with no Dense(1024) layer, while the diagram shows
# Dropout 0.5 and Dense 5504->1024; one of the two should be updated.
from graphviz import Source
src = Source('digraph G {\
node [shape=record]\
a[label="DenseNet201|{input:|output:}|{(224, 224, 3)|(2048)}"]\
b[label="InceptionResNetV2|{input:|output:}|{(299, 299, 3)|(2048)}"]\
c[label="Xception|{input:|output:}|{(299, 299, 3)|(2048)}"]\
Merge[label="Merge|{input:|output:}|{(3, 2048)|(5504)}"]\
Dropout[label="Dropout|Rate:|0.5"]\
Dense[label="Dense|{input:|output:}|{(5504)|(1024)}"]\
Output[label="Output|{input:|output:}|{(1024)|(1)}"]\
Image -> a -> Merge\
Image -> b -> Merge\
Image -> c -> Merge\
Merge -> Dense -> Dropout -> Output\
}')
src
# Hyperparameters.
epochs = 10
batch_size = 16
# Train the head, checkpointing the best weights (lowest val_loss) to disk.
checkpointer = ModelCheckpoint(filepath='saved_models/weights.final_model.hdf5',verbose=2, save_best_only=True)
his_model = model.fit(X_train, y_train, validation_split = 0.2,
                      epochs = epochs, batch_size = batch_size, verbose=2,
                      callbacks=[checkpointer])
# Plot the training and validation loss curves.
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(his_model.history['loss'])
plt.plot(his_model.history['val_loss'])
plt.legend(['loss', 'val_loss'])
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()
# Plot the training and validation accuracy curves.
# NOTE(review): the 'acc'/'val_acc' history keys belong to older Keras;
# tf.keras >= 2 records them as 'accuracy'/'val_accuracy' -- confirm the
# installed version, otherwise these lookups raise KeyError.
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 2)
plt.plot(his_model.history['acc'])
plt.plot(his_model.history['val_acc'])
plt.legend(['acc', 'val_acc'])
plt.xlabel('Epochs')
plt.ylabel('Acc')
plt.show()
# Predict the test set and write the Kaggle submission file.
y_test = model.predict(X_test, verbose=1)
# Clip extreme confidences to limit the log-loss penalty on mistakes.
y_test = y_test.clip(min = 0.005, max = 0.995)
# Rebuild the (unshuffled) test generator just to recover the filename order
# the cached features were extracted in.
data_gen = ImageDataGenerator()
X_test_gen = data_gen.flow_from_directory(test_dir, img_size, shuffle = False,batch_size = 16, classes = None)
import pandas as pd
from keras.preprocessing.image import *
# Fill the predictions into the sample submission template.
csv_file = submission_dir + "/sample_submission.csv"
df = pd.read_csv(csv_file)
# NOTE(review): re_digits is recompiled here but never used in this section.
re_digits = re.compile(r'(\d+)')
for i, fname in enumerate(X_test_gen.filenames):
    # filenames look like 'pics/123.jpg' -> extract the numeric image id
    index = int(fname[fname.rfind('/')+1:fname.rfind('.')])
    # FIX: DataFrame.set_value was removed in pandas 1.0; use .at for scalar
    # assignment, and store a plain float rather than a length-1 array.
    df.at[index-1, 'label'] = float(y_test[i])
df.to_csv(submission_dir + '/pred_result.csv', index=None)
# Show the first 10 predictions.
df.head(10)
# Preview the first 10 images of the test set in a 2x5 grid.
test_list_10 = ["DATASET/test/pics/" + name for name in os.listdir("DATASET/test/pics")][:10]
show_list = test_list_10
plt.figure(figsize=(16, 8))
for pos, pic_path in enumerate(show_list, start=1):
    plt.subplot(2, 5, pos)
    picture = cv2.resize(mpimg.imread(pic_path), (224, 224))
    plt.axis("on")
    plt.title(pic_path)
    plt.imshow(picture)